# Replication of Known Unknowns: Media Bias in the Study of Political Violence
# Nick Dietrich (dietrich.nicholas@gmail.com)
# runs repeated random forests to predict media sourcing for GED events
# run this using ucdp_cforest.sh while ged_replication.csv and ucdp_cforest.pbs are in the directory.
# developed in R version 3.4

### WARNING ###
# Takes about 24 hours to run
# requires 10 available cores

# required packages:
#install.packages("party")
#install.packages("doParallel")
#install.packages("foreach")
#install.packages("edarf")

### handle argument passing from the pbs file
args <- commandArgs(trailingOnly = T)
print(args)
if (length(args) != 1) {
  stop("Must give args: ucdp_cforest.pbs --args 'id'")
} else {
  ll <- as.numeric(args)
}

#set the random seed (passed from the pbs and sh files) for replication purposes
set.seed(ll)

#Set the path to the directory where ged_replication.csv is:
setwd("/storage/home/nmd184/cforest_ucdp")
ged <- read.csv("ged_replication.csv",stringsAsFactors=F)

library(party) #party is VERY inefficient, but doesn't oversample on vars w/ many possible values, so gives unbiased var_imp scores
# See Strobl et al. (2007)


# DV MUST BE A FACTOR. We want classification, not regression!!
ged$media <- as.factor(ged$any_media)
# character vectors of independent and dependent variables
ivs <- c("best","year","osv","nonstate","mountains_mean","media_freedom",#"urban_gc","ttime_mean",
         "bdist2","capdist","americas","africa","asia","europe","gcp_ppp_imp",
         "mideast","excluded","polity2","imr_mean",#"statebased","nlights_mean",
         "gdp_pc","ucdp_intl","ucdp_civ","ucdp_intlciv",
         "xrreg","xrcomp","xconst","parreg","parcomp","trade_percent_gdp",
         "v_media_censorship","v_internet_censorship","v_internet_access",
         "v_media_criticism","v_media_diversity","v_journo_harrassment","pop_gpw_sum_imp",
         "v_media_selfcensor","v_mbias_opposition","v_media_corrupt",#"ip4dist",
         "ip6dist","RTI","stateperp")
labs <- c("Best Estimate","Year","One-Sided Violence","Non-State Event",#"Urban Center Travel Time",
          "Mountains","FH Media Freedom","Distance to Border","Distance to Capital",
          "Americas","Africa","Asia","Europe","Grid Cell Economic Output","Middle East",#"Night Lights",
          "Num. Excluded Groups","Polity","Infant Mortality",#"State-Based Event",
          "GDP per Capita","International Conflict",
          "Civil Conflict","Internationalized Civil Conflict","Regulations Exec Recruitment",
          "Competitive Exec Recruitment","Exec Constraints",
          "Regulated Participation","Competitive Participation",
          "Trade (Percent GDP)","Media Censorship (Vdem)","Internet Censorship (Vdem)",
          "Internet Access (Vdem)","Media Criticism (Vdem)","Media Diversity (Vdem)",
          "Journalist Harrasment (Vdem)","Grid Cell Population","Self-Censorship (Vdem)",
          "Media Bias (Vdem)","Media Corruption (Vdem)",
          "Internet Distance (IP6)","Right to Information","State-Perpetrated Event")
dv <- "media" #character vector of dependent variable

# remove india and syria
ged <- ged[which((ged$country_id!=750)&(ged$country_id!=652)),]
ged <- ged[which(ged$year>=2013),]

# remove unused vars to save memory
ged <- ged[,c(dv,ivs)]

# fit the main model
fit <- cforest(as.formula(paste(dv,".",sep="~")),data=ged,
              controls=cforest_unbiased(ntree=1500))
imp.main <- varimp(fit)

write.csv(imp.main,file=paste0("ucdp_cforest_imp_",ll,".csv"))

# save these objects to a workspace image:
save.image(file="ucdp_cforest.RData")

# in parallel, take bootstrap samples and get varimp scores
library(doParallel)
registerDoParallel(cores=10)
library(foreach)
library(edarf)
imp.boot <- foreach(i=1:20, .combine=rbind) %dopar% {
  #tmp <- ged[sample(x=nrow(ged),size=nrow(ged),replace=T),]
  this.fit <- cforest(as.formula(paste(dv,".",sep="~")),data=ged,
                      controls=cforest_unbiased(ntree=1500))
  write.csv(partial_dependence(this.fit,vars=ivs,n=c(10,25)),
            file=paste0("cforest_pd_boot_",ll,"_",i,".csv"))
  varimp(this.fit)
}

write.csv(imp.boot,file=paste0("cforest_imp_boot_",ll,".csv"))